{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the model before attempting to run this NB\n",
    "# python -m spacy download xx_ent_wiki_sm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import spacy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "file_dir = '../data/ru/factRuEval-2016/devset'\n",
    "devset = 'devset_combined.txt'\n",
    "dev_path = os.path.join(file_dir, devset)\n",
    "\n",
    "with open(os.path.join(file_dir, 'book_1667.txt')) as f:\n",
    "    test_example = f.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "nlp = spacy.load('xx_ent_wiki_sm')\n",
    "doc = nlp(test_example)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">Как бороться с \n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
       "    Пхеньяном\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PER</span>\n",
       "</mark>\n",
       "? Высокопоставленный северокорейский перебежчик дал подробную инструкцию о том, как противостоять \n",
       "<mark class=\"entity\" style=\"background: #ff9561; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
       "    КНДР\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">LOC</span>\n",
       "</mark>\n",
       ".</br></br>Экс-председатель \n",
       "<mark class=\"entity\" style=\"background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
       "    Верховного народного собрания\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">ORG</span>\n",
       "</mark>\n",
       " \n",
       "<mark class=\"entity\" style=\"background: #ff9561; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
       "    КНДР\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">LOC</span>\n",
       "</mark>\n",
       " \n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
       "    Хван Чжан Еп\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PER</span>\n",
       "</mark>\n",
       ", бежавший из страны 13 лет назад и возглавляющий расстрельный список северокорейских перебежчиков, рассказал о том, как можно нивелировать угрозу со стороны \n",
       "<mark class=\"entity\" style=\"background: #ff9561; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
       "    Пхеньяна\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">LOC</span>\n",
       "</mark>\n",
       ". По словам 87-летнего политика, надежды всего мира на то, что после смерти \n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
       "    Кима Чен Ира КНДР\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PER</span>\n",
       "</mark>\n",
       " падёт и сменит режим, тщетны. Как говорится, за ним придут другие. Ни к чему, кроме кровопролития, не приведёт и война с \n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
       "    Пхеньяном\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PER</span>\n",
       "</mark>\n",
       ". Самым эффективным способом воздействия на \n",
       "<mark class=\"entity\" style=\"background: #ff9561; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
       "    КНДР\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">LOC</span>\n",
       "</mark>\n",
       " является идеологическое воздействие со стороны \n",
       "<mark class=\"entity\" style=\"background: #ff9561; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
       "    Китая\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">LOC</span>\n",
       "</mark>\n",
       ". Было бы неплохо, отмечает бывший соратник северокорейского вождя, если бы \n",
       "<mark class=\"entity\" style=\"background: #ff9561; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
       "    Поднебесная\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">LOC</span>\n",
       "</mark>\n",
       ", будучи главным политическим и экономическим союзником \n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
       "    Пхеньяна\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PER</span>\n",
       "</mark>\n",
       ", постепенно отвернулась бы от него. «Нужно обращаться к народу \n",
       "<mark class=\"entity\" style=\"background: #ff9561; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
       "    Северной Кореи\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">LOC</span>\n",
       "</mark>\n",
       ", рассказывать о нарушениях прав человека, которые происходят в стране\n",
       "<mark class=\"entity\" style=\"background: #ff9561; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
       "    »\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">LOC</span>\n",
       "</mark>\n",
       ", — цитирует \n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
       "    Хвана Чжан Епа Lenta.ru\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PER</span>\n",
       "</mark>\n",
       ".</br></br>Со своими стратегическими идеями \n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
       "    Хван\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PER</span>\n",
       "</mark>\n",
       " выступил во время визита в \n",
       "<mark class=\"entity\" style=\"background: #ff9561; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
       "    США\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">LOC</span>\n",
       "</mark>\n",
       ". Во время поездки политика, за которым охотятся спецслужбы \n",
       "<mark class=\"entity\" style=\"background: #ff9561; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
       "    Северной Кореи\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">LOC</span>\n",
       "</mark>\n",
       ", охраняли десятки американских полицейских. В своё время \n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
       "    Хван\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">MISC</span>\n",
       "</mark>\n",
       " был идеологом северокорейского режима, многие также приписывают ему авторство «идей \n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
       "    Чучхе»\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PER</span>\n",
       "</mark>\n",
       ", разработчиком которых официально считается отец \n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
       "    Кима Чен Ира Ким Ир Сен\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PER</span>\n",
       "</mark>\n",
       ".\n",
       "</div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "spacy.displacy.render(doc, style='ent', jupyter=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Actual Tags:\n",
      "\n",
      "28823 loc_name 15 9 826611 1  # 826611 Пхеньяном\n",
      "28824 loc_name 122 4 826624 1  # 826624 КНДР\n",
      "70437 geo_adj 45 15 826614 1  # 826614 северокорейский\n",
      "70438 job 61 10 826615 1  # 826615 перебежчик\n",
      "70439 job 45 26 826614 2  # 826614 826615 северокорейский перебежчик\n",
      "70352 org_name 146 29 826627 3  # 826627 826628 826629 Верховного народного собрания\n",
      "70353 org_descr 167 8 826629 1  # 826629 собрания\n",
      "70354 loc_name 176 4 826630 1  # 826630 КНДР\n",
      "70355 surname 181 4 826631 1  # 826631 Хван\n",
      "70356 name 186 4 826632 1  # 826632 Чжан\n",
      "70357 name 191 2 826633 1  # 826633 Еп\n",
      "70358 loc_name 351 8 826658 1  # 826658 Пхеньяна\n",
      "70359 surname 435 4 826674 1  # 826674 Кима\n",
      "70360 name 440 3 826675 1  # 826675 Чен\n",
      "70361 name 444 3 826676 1  # 826676 Ира\n",
      "70362 loc_name 448 4 826677 1  # 826677 КНДР\n",
      "70363 loc_name 574 9 826705 1  # 826705 Пхеньяном\n",
      "70364 loc_name 627 4 826712 1  # 826712 КНДР\n",
      "70365 loc_name 679 5 826718 1  # 826718 Китая\n",
      "70366 loc_name 760 11 826732 1  # 826732 Поднебесная\n",
      "70367 loc_name 827 8 826740 1  # 826740 Пхеньяна\n",
      "70368 loc_name 899 14 826753 2  # 826753 826754 Северной Кореи\n",
      "70369 surname 997 5 826770 1  # 826770 Хвана\n",
      "70370 name 1003 4 826771 1  # 826771 Чжан\n",
      "70371 name 1008 3 826772 1  # 826772 Епа\n",
      "70372 org_name 1012 8 826773 1  # 826773 Lenta.ru\n",
      "70373 job 129 16 826626 1  # 826626 Экс-председатель\n",
      "70374 job 719 8 826726 1  # 826726 соратник\n",
      "70375 job 728 22 826727 2  # 826727 826728 северокорейского вождя\n",
      "70376 job 745 5 826728 1  # 826728 вождя\n",
      "70387 surname 1056 4 826779 1  # 826779 Хван\n",
      "70388 loc_name 1088 3 826785 1  # 826785 США\n",
      "70390 loc_name 1151 14 826796 2  # 826796 826797 Северной Кореи\n",
      "70391 surname 1223 4 826807 1  # 826807 Хван\n",
      "70393 surname 1368 4 826828 1  # 826828 Кима\n",
      "70394 name 1373 3 826829 1  # 826829 Чен\n",
      "70395 name 1377 3 826830 1  # 826830 Ира\n",
      "70396 surname 1381 3 826831 1  # 826831 Ким\n",
      "70397 name 1385 2 826832 1  # 826832 Ир\n",
      "70398 name 1388 3 826833 1  # 826833 Сен\n",
      "70399 job 1110 8 826790 1  # 826790 политика\n",
      "\n"
     ]
    }
   ],
   "source": [
    "with open(os.path.join(file_dir, 'book_1667.spans')) as f:\n",
    "    print('Actual Tags:\\n')\n",
    "    print(f.read())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Summary\n",
    "This dataset is from factRuEval-2016. The \"labels\" in this dataset are somewhat unusual for NER - for example, the inclusion of \"job\" as a category, as well as the decision to split \"person\" into name, surname, and nickname. These are actually the data's \"span\" labels rather than \"object\" labels. Since the annotator markup of \"objects\" (direct analogues of entities) was done after the markup of spans, information about the actual indices/spans where the \"object\" is located was not preserved. Thus it was not possible to accurately and straightforwardly link objects to their locations in the text, and the span labels are the next best thing we have. \n",
    "\n",
    "This isn't actually a problem, since the span labels are arguably more granular and informative than the typical NER markup would be. It is only really a problem for assessing out-of-the-box performance for spaCy, since the model doesn't come with the ability to predict these labels. \n",
    "\n",
    "SpaCy's reported F1 scores for this model:\n",
    "\n",
    "NER F 79.88\n",
    "\n",
    "NER P 80.27\n",
    "\n",
    "NER R 79.49\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "nlp",
   "language": "python",
   "name": "nlp"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
